home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Magnum One
/
Magnum One (Mid-American Digital) (Disc Manufacturing).iso
/
d12
/
asm_tut.arc
/
RXCOMP.C
< prev
next >
Wrap
Text File
|
1989-03-25
|
15KB
|
435 lines
/* rxgrm.c */
/* REGULAR EXPRESSION GRAMMAR */
Goal
-> RegularExpression <eof>
RegularExpression
-> AltExpr => rx_emit RXNT_ROOT
AltExpr
-> RegExpr
-> AltExpr '|' RegExpr => rx_emit RXNT_ALT
RegExpr
-> RXpr
-> ^ RXpr => rx_emit RXNT_RX_BEGIN
-> RXpr $ => rx_emit RXNT_RX_END
-> ^ RXpr $ => rx_emit RXNT_RX_BOTH
RXpr
-> Span
-> RXpr Span => rx_emit RXNT_RX
Span
-> TermList
-> Span .* => rx_emit RXNT_SPAN_0_FROM
-> Span .+ => rx_emit RXNT_SPAN_1_FROM
-> .* TermList => rx_emit RXNT_SPAN_0_TO
-> .+ TermList => rx_emit RXNT_SPAN_1_TO
-> Span .* TermList => rx_emit RXNT_SPAN_0
-> Span .+ TermList => rx_emit RXNT_SPAN_1
TermList
-> Term
-> TermList Term => rx_emit RXNT_TERM_LIST
Term
-> Primary * => rx_emit RXNT_STAR
-> Primary + => rx_emit RXNT_PLUS
-> Primary ? => rx_emit RXNT_BIN
-> Primary
-> . => rx_emit RXNT_ANY_C
Primary
-> Char
-> <esc> => rx_emit RXNT_ESC_C
-> ( AltExpr )
-> [ ClassList ] => rx_emit RXNT_CLASS
-> [^ ClassList ] => rx_emit RXNT_NOT_CLASS
ClassList
-> ClassItem
-> ClassList ClassItem => rx_emit RXNT_CLASS_LIST
ClassItem
-> Char - Char => rx_emit RXNT_RANGE
-> Char
-> <esc> => rx_emit RXNT_ESC_C
Char -> <c> => rx_emit RXNT_C
/* rx.h */
/* RX.H -- Header file for regular expression processing */
#define SPACE ' '
#define TAB '\t'
#define NL '\n'
#define NULLP (char *) 0
#define AND &&
#define OR ||
#define BACK_SLASH '\\'
#define isoctal(x) ((x) >= '0' AND (x) <= '7')
typedef enum {
ERROR_TOK, EOF_TOK, BEGIN_TOK, /* --- Token Types --- */
END_TOK, ALT_TOK, CHAR_TOK,
ANY_CHAR_TOK, ESCAPE_TOK, LPAREN_TOK,
RPAREN_TOK, LBRACKET_TOK, NOT_TOK,
RBRACKET_TOK, STAR_TOK, PLUS_TOK,
BIN_TOK, DASH_TOK, SPAN0_TOK,
SPAN1_TOK,
RXNT_ROOT, /* --- Node Types --- */
RXNT_RX, RXNT_RX_BEGIN, RXNT_RX_END,
RXNT_RX_BOTH, RXNT_SPAN_0, RXNT_SPAN_1,
RXNT_SPAN_0_TO, RXNT_SPAN_1_TO, RXNT_SPAN_0_FROM,
RXNT_SPAN_1_FROM, RXNT_ALT, RXNT_TERM_LIST,
RXNT_STAR, RXNT_PLUS, RXNT_BIN,
RXNT_C, RXNT_ANY_C, RXNT_ESC_C,
RXNT_RANGE, RXNT_CLASS, RXNT_CLASS_LIST,
RXNT_NOT_CLASS
} RX_SYM;
typedef struct RX_NODE { /* --- REG EXPR NODE --- */
RX_SYM type; /* Node type (see above) */
char c; /* Character */
char flag; /* Flags: */
#define RX_BEGIN 0x01 /* Match beginning of field */
#define RX_END 0x02 /* Match end of field */
struct RX_NODE *lnode; /* Ptr to left-node */
struct RX_NODE *rnode; /* Ptr to right-node */
} RXNODE;
#define RXNODE_0 (RXNODE *) 0
#ifdef RXTRNL /* rx_main has this #define'd. So has definitions */
char *token; /* Ptr to curr token. */
char *input; /* Ptr to char after curr token. */
char *T_beg; /* token-begin ptr */
char *T_end; /* token-end ptr */
RXNODE *rx_root = (RXNODE *) 0; /* Ptr to root node. */
int rx_match_pos = -1; /* Position of match. */
int rx_match_len = -1; /* Length of match. */
#else /* Other modules just contain declarations */
extern char *input, *token, *T_beg, *T_end;
extern RXNODE *rx_root;
extern int rx_match_pos, rx_match_len;
#endif /* RXTRNL */
int rx_parse (char *); /* --- Func Declarations --- */
void rx_print_tree (RXNODE *, int);
int rx_match (RXNODE *, char *);
int rx_scan (void);
/* rx_main.c */
/* RX_MAIN.C -- Driver program for the compiler */
#define RXTRNL
#include <string.h>
#include <stdio.h>
#include <dir.h>
#include <dos.h>
#include <rx.h> /* globals defined here because RXTRNL is #define'd */
int main (int argc, char **argv)
{
int i, rc, match_sw, lin;
FILE *stream;
char *rx, *cp;
char txt [258];
struct ffblk ffblk; /* For Turbo C "findfirst" & "findnext" functions. */
if (argc != 3) {
printf ("RX.EXE ----- Usage is:\n");
printf (" RX \"<regexpr>\" <filemask>\n");
printf ("Where <regexpr> is a regular expression, in double quotes.\n");
printf ("And <filemask> is a DOS filename specification, wildcards OK.\n");
exit (8);
}
rx = argv[1];
i = rx_parse (rx); /* Parse the reg. expr. */
if (i) {
printf ("Parser error = %d\n", i);
exit (4);
}
printf ("RegExpr ==> \"%s\"\n", rx);
rc = findfirst (argv[2], &ffblk, 0);
while (rc == 0) { /* Try to match lines */
match_sw = 0;
lin = 1;
stream = fopen (ffblk.ff_name, "r");
if (stream == (FILE *) 0) {
printf ("RX.EXE ----- Error OPENing data file '%s'.\n",
strupr(ffblk.ff_name));
exit(4);
}
printf ("File: %s\n", strupr(ffblk.ff_name));
txt[0]=0;
while (fgets (txt+1, 256, stream) != 0) {
if ((cp = strchr (txt+1, NL)) != NULLP)
*cp = 0;
/* ----- Attempt to match the reg.expr. ----- */
if (rx_match (rx_root, txt+1)) {
match_sw = 1;
printf ("%3d| %s\n", lin, txt+1);
}
lin++;
}
fclose (stream);
if (!match_sw)
printf ("No match occurred.\n");
rc = findnext (&ffblk);
}
return;
}
/* rx_scan.c */
/* RX_SCAN -- Lexical analyzer for regular expressions */
#include <ctype.h>
#include <rx.h>
static int in_brackets = 0; /* In class brackets ? */
int rx_scan (void)
{
token=input;
switch (*input++) {
case 0:
case NL: return EOF_TOK;
case '^': if (in_brackets) return CHAR_TOK;
return BEGIN_TOK;
case '$': if (in_brackets) return CHAR_TOK;
return END_TOK;
case '.': if (in_brackets) return CHAR_TOK;
if (*input=='*') { ++input; return SPAN0_TOK; }
if (*input=='+') { ++input; return SPAN1_TOK; }
return ANY_CHAR_TOK;
case '[': ++in_brackets;
if (*input=='^') { ++input; return NOT_TOK; }
else { return LBRACKET_TOK; }
case ']': --in_brackets; return RBRACKET_TOK;
case '|': if (in_brackets) return CHAR_TOK;
return ALT_TOK;
case '(': if (in_brackets) return CHAR_TOK;
return LPAREN_TOK;
case ')': if (in_brackets) return CHAR_TOK;
return RPAREN_TOK;
case '*': if (in_brackets) return CHAR_TOK;
return STAR_TOK;
case '+': if (in_brackets) return CHAR_TOK;
return PLUS_TOK;
case '-': if (in_brackets
AND isalnum(*(input-2))
AND isalnum(*input))
return DASH_TOK;
return CHAR_TOK;
case '?': if (in_brackets) return CHAR_TOK;
return BIN_TOK;
case BACK_SLASH:
if (*input=='b' OR *input=='f' OR *input=='n'
OR *input=='r' OR *input=='t') /* Std escape sequences */
{ ++input; return ESCAPE_TOK; }
if (isoctal(*input) AND isoctal(*(input+1))
AND isoctal(*(input+2))) /* Octal number */
{ input+=3; return ESCAPE_TOK; }
++input; /* Else, literal char */
++token;
return CHAR_TOK;
default:
return CHAR_TOK;
}
}
/* rx_emit.c */
/* RX_EMIT.C -- Emitter routines for abstract syntax tree. */
#include <rx.h>
#define RX_STK_SIZE 40 /* Arbitrary limits. Should be */
#define RX_NUM_NODES 200 /* enough for most cases, tho. */
static RXNODE *rxstk [RX_STK_SIZE]; /* Semantic stack for reg expr. */
static int idx = -1; /* Index into semantic stack. */
static RXNODE rxnodearea [RX_NUM_NODES]; /* Stg for RXNODEs. */
static RXNODE *rxnp = rxnodearea - 1; /* Pointer to last used node. */
static RXNODE *rxnp_max = rxnodearea + RX_NUM_NODES; /* End of node area. */
static RXNODE *make_rxnode (int, RXNODE *, RXNODE *); /* Func Decl. */
int rx_emit (int nodetype)
{
char c;
int i;
RXNODE *np;
switch (nodetype) {
case RXNT_ROOT: rx_root = rxstk[idx];
break;
case RXNT_RX:
case RXNT_STAR:
case RXNT_PLUS:
case RXNT_BIN:
case RXNT_CLASS:
case RXNT_NOT_CLASS:
/* --- The single line of code below does it all! 1). Create a new
node. 2). Pop the top node pointer off the semantic stack.
3). Attach it as the new node's left-node pointer. 4). Push
pointer to new node onto the stack. */
rxstk[idx] = make_rxnode (nodetype, rxstk[idx], RXNODE_0);
break;
case RXNT_RX_BEGIN:
case RXNT_RX_END:
case RXNT_RX_BOTH:
rx_emit (RXNT_RX); /* Note node-type changed to RXNT_RX */
if (nodetype == RXNT_RX_BEGIN
OR nodetype == RXNT_RX_BOTH)
rxstk[idx]->flag |= RX_BEGIN;
if (nodetype == RXNT_RX_END
OR nodetype == RXNT_RX_BOTH)
rxstk[idx]->flag |= RX_END;
break;
case RXNT_ALT:
case RXNT_TERM_LIST:
case RXNT_CLASS_LIST:
case RXNT_RANGE:
case RXNT_SPAN_0:
case RXNT_SPAN_1:
rxstk[idx-1] = make_rxnode (nodetype, rxstk[idx-1], rxstk[idx]);
--idx;
break;
case RXNT_SPAN_0_TO:
case RXNT_SPAN_1_TO:
rxstk[idx] = make_rxnode (nodetype, RXNODE_0, rxstk[idx]);
break;
case RXNT_SPAN_0_FROM:
case RXNT_SPAN_1_FROM:
rxstk[idx] = make_rxnode (nodetype, rxstk[idx], RXNODE_0);
break;
case RXNT_C:
rxstk[++idx] = make_rxnode (nodetype, RXNODE_0, RXNODE_0);
rxstk[idx]->c = *T_beg; /* Parser maintains "T_beg". */
break;
case RXNT_ANY_C:
rxstk[++idx] = make_rxnode (nodetype, RXNODE_0, RXNODE_0);
break;
case RXNT_ESC_C: /* Note node type changed to RXNT_C. */
np = rxstk[++idx] = make_rxnode (RXNT_C, RXNODE_0, RXNODE_0);
c = *(T_beg+1);
if (c == 'b') { np->c = '\b'; return 0; }
if (c == 'f') { np->c = '\f'; return 0; }
if (c == 'n') { np->c = '\n'; return 0; }
if (c == 'r') { np->c = '\r'; return 0; }
if (c == 't') { np->c = '\t'; return 0; }
if (isoctal(*(T_beg+1))
AND isoctal(*(T_beg+2))
AND isoctal(*(T_beg+3))) {
i = (*(T_beg+1) - '0') * 64;
i += (*(T_beg+2) - '0') * 8;
i += *(T_beg+3) - '0';
np->c = i;
}
else
np->c = *T_beg+1;
break;
default: exit (3); /* Something has gone wrong. */
}
if (idx >= RX_STK_SIZE) {
printf ("Node-pointer stack overflow.\n");
exit (4);
}
return 0;
}
static RXNODE *make_rxnode (int type, RXNODE *lnode, RXNODE *rnode)
{
if (++rxnp >= rxnp_max) { /* Alloc area for a new node */
printf ("Node area overflow\n");
exit (4);
}
rxnp->type = type;
rxnp->lnode = lnode;
rxnp->rnode = rnode;
rxnp->c = rxnp->flag = 0;
return (rxnp);
}
/* rx_match.c */
#include <string.h>
#include <rx.h>
static int match_node (RXNODE *, char *); /* Local function */
int rx_match (RXNODE *np, char *txt)
{
int i, j;
for (i=0; txt[i]; i++) {
j = match_node (np, txt+i);
if (j >= 0) {
rx_match_pos = i; /* A MATCH !!! */
rx_match_len = j;
return 1;
}
}
rx_match_pos = -1; /* No match. */
rx_match_len = -1;
return 0;
}
static int match_node (RXNODE *np, char *txt)
{
int i, j, n, dot;
if (np == (RXNODE *) 0) return 0;
switch (np->type) {
case RXNT_RX: /* Must match both sub-nodes. */
if (np->flag & RX_BEGIN
AND *(txt-1) != 0)
return -1;
i = match_node (np->lnode, txt);
if (i<0) return -1;
j = match_node (np->rnode, txt+i);
if (j<0) return -1;
if (np->flag & RX_END
AND *(txt+i+j+1) != 0)
return -1;
return i+j;
case RXNT_ALT: /* Match if either subnode matches. */
i = match_node (np->lnode, txt);
j = match_node (np->rnode, txt);
return i>j ? i : j; /* Return the longer matching length. */
case RXNT_TERM_LIST: /* Must match both subnodes. */
i = match_node (np->lnode, txt);
if (i<0) return -1;
j = match_node (np->rnode, txt+i);
if (j<0) return -1;
return i+j;
case RXNT_PLUS: /* Must match 1 or more occurance(s) of left-node. */
i = match_node (np->lnode, txt);
if (i<0) return -1;
/* ------ Fall into RXNT_STAR ----- */
case RXNT_STAR: /* Match 0 or more occurance(s) of left-node. */
n = (np->type == RXNT_PLUS ? i : 0); /* Account for fall-in. */
while (1) {
i = match_node (np->lnode, txt+n);
if (i<=0) return n;
n += i;
}
case RXNT_BIN: /* Must match 0 or 1 occurance of left-node. */
i = match_node (np->lnode, txt);
return i<0 ? 0 : i;
case RXNT_RANGE: /* Must be in range, inclusive. */
return (*txt < np->lnode->c OR *txt > np->rnode->c) ? -1 : 1;
case RXNT_CLASS: /* Must match left-node. */
i = match_node (np->lnode, txt);
return i>0 ? i : -1;
case RXNT_CLASS_LIST: /* Must match either subnode. */
i = match_node (np->lnode, txt);
if (i>0) return i;
j = match_node (np->rnode, txt);
if (j>0) return j;
return -1;
case RXNT_NOT_CLASS: /* Must not match left-node. */
i = match_node (np->lnode, txt);
return i>0 ? -1 : 1;
case RXNT_SPAN_0:
case RXNT_SPAN_1:
i = match_node (np->lnode, txt);
if (i<0) return -1;
if (i >= strlen(txt) AND np->type == RXNT_SPAN_1)
return -1;
dot = (np->type == RXNT_SPAN_1 ? 1 : 0);
for (n=strlen(txt+i+dot); n>=0; n--) {
j = match_node (np->rnode, txt+i+dot+n);
if (j>=0) return i+dot+n+j;
}
return -1;
case RXNT_SPAN_0_TO:
case RXNT_SPAN_1_TO:
dot = (np->type == RXNT_SPAN_1_TO ? 1 : 0);
for (n=strlen(txt+dot); n>=0; n--) {
i = match_node (np->rnode, txt+dot+n);
if (i>=0) return dot+n+i;
}
return -1;
case RXNT_SPAN_0_FROM:
case RXNT_SPAN_1_FROM:
i = match_node (np->lnode, txt);
if (i<0) return -1;
if (i >= strlen(txt) AND np->type == RXNT_SPAN_1_FROM)
return -1;
return strlen(txt);
case RXNT_C: /* Must match the character. */
return *txt == np->c ? 1 : -1;
case RXNT_ANY_C: /* Matches anything except end-of-string. */
return *txt ? 1 : -1;
default: /* "I don't think we're in Kansas anymore, Toto." */
exit (1);
}
}